# ---- Setup ----
# Remove every (non-hidden) object from the current environment so the script
# starts from a clean workspace. NOTE(review): this also wipes anything the
# caller had defined when the script is source()'d into an interactive session.
rm(list=ls())

# Package attach order determines which package's functions mask which
# (later attaches win), so the order below is kept as-is.
library(survival)
library(reshape2)
library(countrycode)
library(tidyverse)
library(lfe)
library(estprod)

# Restores the objects saved in this .RData file into the workspace.
# Presumably this is where `data_binary` comes from: it is used below but is
# never created in this part of the script -- TODO confirm.
load('binary_distance_globalization.RData')

# ---- Self-mentions ----
# Read the classifier output, drop every column whose name starts with 'x',
# and keep only the rows with Predicted == 1.
undergrad_predicted <- read.csv('kw_relevance_predicted_set.csv', stringsAsFactors = FALSE) %>%
  select(-starts_with('x')) %>%
  filter(Predicted == 1)

# Flag texts that contain "our"/"Our", followed by at most two intervening
# words, followed by a word beginning with "border" or "boundar".
undergrad_predicted <- undergrad_predicted %>%
  mutate(self_mention = grepl('[Oo]ur ([a-zA-Z0-9_]+ ){0,2}(border|boundar)', text, perl = TRUE))

# One row per (country, year) with at least one flagged text, encoded as a
# country mentioning itself (country_2 == country_1) with mention = conttype = 1.
self_mentions <- undergrad_predicted %>%
  filter(self_mention) %>%
  distinct(country, year) %>%
  transmute(country_1 = country,
            year,
            country_2 = country_1,
            mention = 1,
            conttype = 1)
#self_mentions <- self_mentions %>% expand(country_1 = unique(data_binary$country_1), year = unique(data_binary$year)) %>% left_join(self_mentions)

# Add every data_binary column that self_mentions lacks, filled with NA, so
# the two tables have the same set of columns before stacking.
self_mentions[, setdiff(names(data_binary), names(self_mentions))] <- NA

data_binary <- bind_rows(data_binary, self_mentions)

# ---- UN sub-regions for both sides of each dyad ----
# Manual sub-region assignments, keyed by country code. They are applied after
# the countrycode lookup and overwrite whatever it returned for these codes.
region_overrides <- c(`CSK` = 'Eastern Europe',
                      `DDR` = 'Eastern Europe',
                      `GE-AB` = 'Western Asia',
                      `TWN` = 'Eastern Asia',
                      `XKX` = 'Southern Europe',
                      `YDM` = 'Western Asia',
                      `YUG` = 'Southern Europe')

data_binary$region_1 <- countrycode(data_binary$country_1, 'iso3c', 'un.regionsub.name')
data_binary$region_2 <- countrycode(data_binary$country_2, 'iso3c', 'un.regionsub.name')

for (override_code in names(region_overrides)) {
  data_binary$region_1[data_binary$country_1 == override_code] <- region_overrides[[override_code]]
  data_binary$region_2[data_binary$country_2 == override_code] <- region_overrides[[override_code]]
}

# ---- UN sub-region for each predicted text's country ----
# Look up the sub-region from the country code, then apply manual assignments
# for specific codes.
# The overrides must be keyed on `country`: the previous version compared
# `region` (a sub-region name or NA) against country codes, which can never
# match, so none of the overrides was ever applied.
undergrad_predicted$region <- countrycode(undergrad_predicted$country, 'iso3c', 'un.regionsub.name')
undergrad_predicted$region[undergrad_predicted$country == 'CSK'] <- 'Eastern Europe'
undergrad_predicted$region[undergrad_predicted$country == 'DDR'] <- 'Eastern Europe'
undergrad_predicted$region[undergrad_predicted$country == 'GE-AB'] <- 'Western Asia'
undergrad_predicted$region[undergrad_predicted$country == 'TWN'] <- 'Eastern Asia'
undergrad_predicted$region[undergrad_predicted$country == 'XKX'] <- 'Southern Europe'
undergrad_predicted$region[undergrad_predicted$country == 'YDM'] <- 'Western Asia'
undergrad_predicted$region[undergrad_predicted$country == 'YUG'] <- 'Southern Europe'

# ---- Sub-region -> continent lookup ----
# Named character vector: names are UN sub-region names (as produced by the
# 'un.regionsub.name' lookups in this script), values are the coarser
# groupings used in the analysis. It is spliced into dplyr::recode() with !!!,
# which leaves any value without a matching name unchanged -- so a misspelled
# key silently fails to recode. ('Central Asia' was previously spelled
# 'Centeral Asia' and therefore never mapped to 'Asia'.)
# Note that Western Asia and Northern Africa are grouped together as 'MENA'.
continent_recode <- c(`Eastern Europe` = 'Europe',
                      `Western Europe` = 'Europe',
                      `Southern Europe` = 'Europe',
                      `Northern Europe` = 'Europe',
                      `Eastern Asia` = 'Asia',
                      `South-eastern Asia` = 'Asia',
                      `Southern Asia` = 'Asia',
                      `Central Asia` = 'Asia',
                      `Western Asia` = 'MENA',
                      `Northern Africa` = 'MENA',
                      `Sub-Saharan Africa` = 'Africa',
                      `Latin America and the Caribbean` = 'Americas',
                      `Northern America` = 'Americas',
                      `Polynesia` = 'Oceania',
                      `Micronesia` = 'Oceania',
                      `Melanesia` = 'Oceania',
                      `Australia and New Zealand` = 'Oceania')

# ---- Continents for both sides of each dyad ----
# Collapse each side's sub-region through continent_recode. Sub-regions that
# have no entry in the lookup are left as they are, and NA stays NA.
# Then recode unmatched place mentions to NA: wherever no sub-region was found
# for the second country, its code is blanked out as well.
data_binary <- data_binary %>%
  mutate(continent_1 = recode(region_1, !!!continent_recode),
         continent_2 = recode(region_2, !!!continent_recode),
         country_2 = replace(country_2, is.na(region_2), NA))

# ---- UN sub-region and continent for each predicted text's country ----
# Recompute the sub-region from the country code, then apply manual
# assignments keyed on the country code. The override list now matches the one
# used for data_binary's region_1 / region_2 (previously only CSK, DDR, YDYE
# and YUG were handled here, so e.g. TWN or XKX rows could end up in a
# different region than the same country in data_binary).
# 'YDYE' is kept from the earlier version of this block alongside 'YDM'.
# NOTE(review): confirm which of the two codes actually occurs in `country`.
undergrad_region_overrides <- c(`CSK` = 'Eastern Europe',
                                `DDR` = 'Eastern Europe',
                                `GE-AB` = 'Western Asia',
                                `TWN` = 'Eastern Asia',
                                `XKX` = 'Southern Europe',
                                `YDM` = 'Western Asia',
                                `YDYE` = 'Western Asia',
                                `YUG` = 'Southern Europe')

undergrad_predicted$region <- countrycode(undergrad_predicted$country, 'iso3c', 'un.regionsub.name')

# match() returns NA for countries without an override; only the matched rows
# are overwritten.
override_idx <- match(undergrad_predicted$country, names(undergrad_region_overrides))
has_override <- !is.na(override_idx)
undergrad_predicted$region[has_override] <- unname(undergrad_region_overrides[override_idx[has_override]])

# Collapse sub-regions to continents; values absent from the lookup pass
# through unchanged.
undergrad_predicted$continent <- recode(undergrad_predicted$region,
                                 !!!continent_recode)

# Force conttype to 1 on every row where a country mentions itself.
# The comparison is NA wherever either country code is NA; assigning a single
# value through a logical index that contains NA leaves those rows untouched.
data_binary$conttype[data_binary$country_1 == data_binary$country_2] <- 1

# Figure 3: self-mention plot
# Per year, over rows that are mentions of a matched place:
#   median_dist - median of `distance`. na.rm = TRUE is required because the
#                 appended self-mention rows carry NA in every column they did
#                 not define themselves (including distance); without it the
#                 median is NA for every year that has a self-mention.
#   self_freq   - share of mentions that are self-mentions or have conttype == 1
#   other_freq  - share of mentions with conttype == 0
dists <- data_binary %>%
  filter(mention == 1, !is.na(country_2)) %>%
  group_by(year) %>%
  dplyr::summarise(median_dist = median(distance, na.rm = TRUE),
                   self_freq = mean(country_1 == country_2 | conttype == 1),
                   other_freq = mean(conttype == 0))

# Long format for plotting: one row per (year, variable). names_to / values_to
# are passed by name; passed positionally, the strings do not reach those
# parameters (in current tidyr the positions after `cols` belong to `...`).
dists <- dists %>%
  pivot_longer(c(self_freq, other_freq), names_to = 'variable', values_to = 'value')

# The legend labels are given in the alphabetical order of `variable`:
# 'other_freq' -> 'Other' (red), 'self_freq' -> 'Self/Neighbor' (blue).
ggplot(dists, aes(x=year, y=value, color=variable)) +
  geom_smooth(alpha=1, fill='grey85') +
  geom_point() +
  theme_minimal() +
  xlab(NULL) +
  ylab('Proportion of mentions') +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        legend.position='bottom') +
  scale_color_manual(labels=c('Other', 'Self/Neighbor'), name='Mention Type', values=c('red', 'blue'))
